global root_dir = "`1'"

include "$root_dir/code/config/config.do"


cap noi log using ${log_dir}/2_map_flow_make_sic.log, replace name(dat)

*Handle empty arguments
global arg1 = cond("`2'" == "___EMPTY___", "", "`2'")
global arg2 = cond("`3'" == "___EMPTY___", "", "`3'")
global arg3 = cond("`4'" == "___EMPTY___", "", "`4'")
global arg4 = cond("`5'" == "___EMPTY___", "", "`5'")

if "$arg1" != "" {
    global weight_category "$arg1"
    di "Weight category: ${weight_category}"
}

if "$arg2" != "" {
    global weight_versions "$arg2"
    di "Weight versions: ${weight_versions}"
}

if "$arg3" != "" {
    global weight_window "$arg3"
    di "Weight window: ${weight_window}"
}

if "$arg4" != "" {
	global wtype "$arg4"
}
di "${wtype}"
capture noi {

/* This do-file prepares a make and capital flow table and uses it to map industries
 of manufacturing (SIC) into industries of using (SIC). Specifically there are five steps 

    - A. Import concordance between SIC and IO table industries for the MAKE table
    - B. Map the MAKE table into SIC and update value_make
    - C. Import concordance between SIC and IO table industries for the FLOW table
    - D. Map the FLOW table into SIC and update value_flow
    - E. Match the MAKE and FLOW table
*/

**************************************
* A. Build IO make to SIC4 concordance MAKE
**************************************

* 1) Map make industries to NAICS6

* Import and clean IO make industries to naics6 1997 concordance, 
*Bureau of Economic Analysis (BEA). n.d. "Historical Benchmark Input-Output Tables."
* Accessed November 2018. https://www.bea.gov/industry/historical-benchmark-input-output-tables.

import excel ${alm_data_raw}/bea/ndn0306/NAICS-IO.xls, sheet("A") firstrow allstr clear 

*renaming and cleaning
ren (IOIndustrynumber IOIndustrytitle Related1997NAICScodes D E F G H I J) (ind_make_IO ind_title c1 c2 c3 c4 c5 c6 c7 c8)
* drops the top level categories
drop if ind_make_IO == "" 
reshape long c, i(ind_make_IO) j(naics6_nr)
rename c naics6
*drop empty values (correspond to the emptxy columns in the excel)
drop if naics6 == ""
* some industries have e.g., 23* as code, correct formatting
replace naics6 = subinstr(naics6, "*","",.)  
replace naics6 = strtrim(naics6 + substr("000000", 1, 6 - length(naics6)))
*count how often a code appears
duplicates tag ind_make_IO, gen(ind_dup)
gen ind_count = 1 + ind_dup
bys ind_make_IO : egen naics6_by_ind = max(ind_count)
keep ind_make_IO ind_title naics6 naics6_by_ind
tab naics6_by_ind

label var ind_make_IO "IO industry code (making)"
label var naics6 "NAICS6 code"
label var naics6_by_ind "Number of related NAICS6 codes"

tempfile io_naics6
save "`io_naics6'"
* 74% of the industries have a 1:1 mapping. For the others: build weights using employment data from the CBP in 1998
 
* Import and clean CBP employment data in 1998, match with the concordance, generate weight
* County Business Patterns Database by the US CEnsus Bureau. imported here is a more complete version with 
*imputed values by Fabian Eckert, Teresa C. Fort, Peter K. Schott, and Natalie J. Yang.
* Eckert, Fabian, Teresa C. Fort, Peter K. Schott, and Natalie J. Yang. 2021.
* "Imputing Missing Values in the US Census Bureau's County Business Patterns." NBER Working Paper #26632. Accessed December 2019. http://fpeckert.me/cbp/
import delimited ${alm_data_raw}/concordances/efsy_1998/1998/cbp98us.txt, clear 
keep naics emp empflag
sort naics
* drops total employment
drop if strpos(naics, "-----")
* correct the flagged ind's with zero employments (distribute residual empl fractionally from aggregate)
list naics emp if empflag != ""

*485112, 485119 are empty use 48511 - 485111 - 485113 s employment to compute the remaining employment in the code, 
*split evenly between the two
*cannot really do in loop without much code due to the "/"" in "48511/"
gen emp_48511 = emp if naics == "48511/"
summarize emp_48511, meanonly
replace emp_48511 = r(mean)
gen emp_485111 = emp if naics == "485111"
summarize emp_485111, meanonly
replace emp_485111 = r(mean)
gen emp_485113 = emp if naics == "485113"
summarize emp_485113, meanonly
replace emp_485113 = r(mean)
replace emp = (emp_48511-emp_485111-emp_485113)/2 if inlist(naics, "485112", "485119")
drop emp_48511 emp_485111 emp_485113
*distribute 4889 evenly to 48991 and 48999
gen emp_4889 = emp if naics == "4889//"
summarize emp_4889, meanonly
replace emp_4889 = r(mean)
replace emp = emp_4889/2 if inlist(naics, "488991", "488999")
drop empflag emp_4889

*clean up the naics6 to 6 digit codes with trailing zeros
replace naics=subinstr(naics,"-","0",.)
replace naics=subinstr(naics,"/","0",.)
ren naics naics6

* drop the duplicates we generated by including zeros instead of .
duplicates drop naics6 emp, force
* 2 pairs of naics6's are split; combine them
duplicates list naics6 
bys naics6: egen emp_tot = sum(emp)
bys naics6: keep if _n == 1
replace emp = emp_tot

label var naics6 "NAICS6 code"
label var emp "Employment"
label var emp_tot "Total employment"

tempfile cbp98us_pre
save "`cbp98us_pre'"

* Aggregate employment for 451000, 452000, 453000, 454000 at 450000
replace naics6= "450000" if naics6 == "451000"
* adding 452000, 453000, 454000
foreach naics in 452000 453000 454000 {
    gen emp_`naics' = emp if naics6 == "`naics'"
    summarize emp_`naics', meanonly
    replace emp_`naics' = r(mean)
    replace emp = emp + emp_`naics' if naics6 == "450000"
    drop emp_`naics'
}
destring naics6, replace
drop if inrange(naics6,450001, 459999)
tostring naics6, replace
tempfile cbp98us
save "`cbp98us'"

use "`io_naics6'", clear
joinby naics6 using "`cbp98us'", unmatched(both)
sort naics6
drop if _merge==2
rename _merge _merge_CBP

* generate employment-based weight
egen emp_by_ind =total(emp) if emp != 0, by(ind_make_IO)  
gen weight_emp = 1 if naics6_by_ind == 1
replace weight_emp = emp/emp_by_ind if naics6_by_ind > 1
drop emp_by_ind naics6_by_ind
label var weight_emp "Employment based weight"
label var _merge_CBP "CBP merged into NAICS6-IO"

tab naics if emp != emp_tot

tempfile io_naics6_w_emp
save "`io_naics6_w_emp'", replace

* note that some non 1:1 mapping agricultural industries do not have employment data. We deal with it below

* 2) Map NAICS6 to SIC4

* import concordance naics6 1997 to SIC4 with empl weights
*(C1) NAICS97 6-digit to SIC87 4-digit. David Autor, David Dorn, and Gordon Hanson. 
*"The China Syndrome: Local Labor Market Effects of Import Competition in the United States."
* American Economic Review, 103(6), 2121-2168, 2013. Accessed December 2012. https://www.ddorn.net/data.htm.
use ${alm_data_raw}/concordances/cw_n97_s87.dta, clear
*drop auxiliary industries and reshuffle weights bc with the drop they do not sum up to 1 anymore
drop if sic4 >= 20001 
egen tot =total(weight), by(naics6)
gen weight_sic4 = weight/tot
drop tot weight
tostring naics6, replace

label var naics6 "NAICS97 code"
label var sic4 "SIC4 code (1987)"
label var weight_sic4 "Employment based weight"

tempfile naics6_sic4
save "`naics6_sic4'", replace

* merge with IO-NAICS6
use "`io_naics6_w_emp'"
joinby naics6 using "`naics6_sic4'", unmatched(both)
* keep the cases where we again have to assing CBP employment from 1998
keep if _merge!=3 
destring naics6, replace
* drop agriculture
drop if inrange(naics6, 111110, 112900) 
tostring naics6, replace
keep naics6 _merge
duplicates drop naics6 _merge, force
sort naics6 

* assign naics6 to aggragted codes (if no data)
gen naics6_top = naics6 if _merge==1
gen agg_naics6 = ""
gen naics_length = length(reverse(substr(reverse(naics6), indexnot(reverse(naics6), "0"),.)))
forvalues d = 5(-1)2 { 
    gen naics6_`d' = substr(naics6,1,`d') + substr("000000", 1, 6 - `d')
    gen tag_`d' = 1 if naics6_top == naics6_`d' & naics_length == `d'
    egen tot_`d' = max(tag_`d'), by(naics6_`d')
    replace agg_naics6 = naics6_`d' if tot_`d' == 1
    drop naics6_`d' tot_`d' tag_`d'
}
drop if naics6 == "990000" 
drop if _merge == 1
drop naics6_top _m naics_length

* use CBP employment weights again to map aggregataed NAICS6's into more disaggregated codes
joinby naics6 using "`cbp98us_pre'", unmatched(both)
sort naics6
assert _merge!=1
drop if _merge==2
drop _merge
egen empy_by_agg_naics6 =total(emp), by(agg_naics6)
gen weight_emp_disagg =emp/empy_by_agg_naics6
drop empy_by_agg_naics6 emp

label var agg_naics6 "Aggregated NAICS6 code"
label var weight_emp_disagg "Employment based weight"

tempfile naics6_sic4_disaggr
save "`naics6_sic4_disaggr'", replace

* apply disaggreagted mapping
use "`io_naics6_w_emp'", clear
rename naics6 agg_naics6
joinby agg_naics6 using "`naics6_sic4_disaggr'", unmatched(both)
assert _merge!=2
rename _merge _merge_disaggr

replace naics6 = agg_naics6 if _merge_disagg == 1 & _merge_CBP ==3
replace weight_emp_disagg = 1 if _merge_disagg == 1  

* merge with mapping into SIC from raw Dorn file
joinby naics6 using "`naics6_sic4'", unmatched(both)
drop if _merge == 2
ren _merge _merge_sic4
sort ind_make_IO agg_naics6 naics6 sic4

* 4) correct residual special cases, apply weights to do mapping

* TODO: comment jf. this is to redo as well; but I need to move forward in the interest of time. 
* I just made the renames (from weight_2, merge_2, temp4 etc that are terrible) to make it work but no further

* First, three naics6 industries do not match in this concordance table 
* --> use concordance from "1997_naics6_to_1987_SIC.xls" from Census Webpage
*enter by hand and divide equally (agriculture done more below!)
* https://www.census.gov/naics/concordances/1987_SIC_to_1997_NAICS.xls

* destring naics6, replace
expand 2 if agg_naics6== "482000"
* sort naics6
bys agg_naics6: gen dup = cond(_N==1,0,_n) if agg_naics6== "482000"
replace sic4 = 4011 if dup ==1
replace sic4 = 4013 if dup ==2
drop dup
replace weight_sic4 = 0.5 if  agg_naics6== "482000"
replace _merge_sic4 = 3 if  agg_naics6== "482000"

expand 2 if agg_naics6== "491000"
bys agg_naics6: gen dup = cond(_N==1,0,_n) if agg_naics6== "491000"
replace sic4=4311 if dup ==1
replace sic4=7389 if dup ==2
drop dup
replace weight_sic4 = 0.5 if agg_naics6== "491000"
replace _merge_sic4= 3 if agg_naics6== "491000"

replace sic4= 8811 if agg_naics6 == "814000"
replace weight_sic4 =1 if agg_naics6 == "814000"
replace _merge_sic4= 3 if agg_naics6 == "814000"


* do mapping ind_make_IO into sic4 using the weights (except the manual corrections for agriculture after)
sort ind_make_IO agg_naics6 naics6 sic4
gen weight_combined = weight_emp * weight_emp_disagg * weight_sic4 if _merge_sic4 == 3
egen w_IO_make_SIC = total(weight_combined) if _merge_sic4 ==3 ,by(ind_make_IO sic4) 
duplicates drop ind_make_IO sic4 if _merge_sic4, force

*Second, the first few industries (part of agriculture) we do not have employment
*weights at the CBP and also no weighted mapping into sic4 from Dorn.
*However they map mostly into sic4 100 and 200 --> so aggregate sic4 to this
*level and assign ind_make_IO reasonably
*Look also at file 1997_naics6_to_1987_SIC.xls" from Census Webpage for reference

*Some ind_make_IO map definitely into sic 100
*111400 also some part in 811 --> ignore
*1119B0 also some part else --> ignore
foreach x in 1111A0 1111B0 111200 111335 1113A0 111400 111910 111920 1119A0 1119B0 {
    duplicates drop ind_make_IO if ind_make_IO=="`x'", force
    replace sic4=100 if ind_make_IO=="`x'"
    replace w_IO_make_SIC=1 if ind_make_IO=="`x'"
}

*Some ind_make_IO map definitely into sic 200
*112A00 also some part in 921 --> ignore
foreach x in 112100 112300 112A00 {
    duplicates drop ind_make_IO if ind_make_IO=="`x'", force
    replace sic4=200 if ind_make_IO=="`x'"
    replace w_IO_make_SIC=1 if ind_make_IO=="`x'"
}

* check weights, shgould maximally have rounding errors
egen test=total(w_IO_make_SIC) if _merge_sic4==3, by(ind_make_IO)
assert test > 0.99999 & test < 1.000001 if _merge_sic4==3
drop test

* save concordance
keep ind_make_IO sic4 w_IO_make_SIC

*how much of an IO make employment goes towards a SIC4 code
label var w_IO_make_SIC "relative employment of make table IO going towards the sic4 code"

tempfile ind_make_IO_sic4
save "`ind_make_IO_sic4'", replace


***********************************
* B. Buid make table
***********************************

*Import and clean the MAKE table "Benchmark Input-Output Accounts of the United States, 1997," SURVEY OF CURRENT BUSINESS (December 2002); 
import delimited ${alm_data_raw}/bea/ndn0306/NAICSMakeDetail.txt, clear 
rename (v1 v2 v4) (ind_make_IO com make_value)
label var com "Commodity code"
label var ind_make_IO "IO industry code (making)"
label var make_value "Value of make table"
drop v3

* apply our concordance from the previous step
joinby ind_make_IO using "`ind_make_IO_sic4'", unmatched(both)
drop if _merge==1
assert _merge==3
drop _merge

* update make_value at SIC level
gen multiply = make_value*w_IO_make_SIC
egen make_value_SIC = total(multiply), by (sic4 com)
drop multiply
duplicates drop sic4 com, force

* generate the make weight
keep com sic4 make_value_SIC
rename sic4 sic_ind_make
egen total_value_make = total(make_value_SIC), by(sic_ind_make)
gen w_make = make_value_SIC / total_value_make
drop make_value_SIC

*rename some commodities which do not occur like this in the flow table
*e.g. com 2122A0 does occur in the make table but not in the flow table, in 
*the flow table we have com 212291. But according to the concordance of IO and
*NAICS 1997 the com 2122A0 maps into NAICS 212290 and 212220. (So com 2122A0 
*there). Reasoning: we are focusing on machinery patents, taking our assumptions 
*seriously, this means that all our patents should be associated with a 
*commodity that is sold as equipment. So if a patent is somehow linked to 
*commodity 2221A0 then it should be in the part that can be used as equipment, 
*namely 212291.

replace com="212291" if com=="2122A0"
replace com="334519" if com=="33451A"
replace com="335931" if com=="335930"
replace com="336360" if com=="336300"
replace com="337125" if com=="33712A"
replace com="339942" if com=="339940"
replace com="339999" if com=="33999A"
replace com="531210" if com=="531000"
replace com="541330" if com=="541300"

label var com "Commodity code"
label var sic_ind_make "SIC4 code"
label var w_make "relative value of commodity make (make_value_SIC)/total_value_make by sic4 and commodity"
label var total_value_make "employment weighted make account for sic4 code"

tempfile sic_ind_make_value
save "`sic_ind_make_value'", replace 

**************************************
* C. Build IO to SIC4 concordance FLOW (very much analogous to the IO MAKE in A.)
**************************************

* import and clean concordance 123 IO industries to NAICSx 1997
*Bureau of Economic Analysis (BEA). 2003b. "Capital Flow Data for 1997." 
*Accessed November 2018. https://www.bea.gov/sites/default/files/newsreleases/industry/capflow/2003/xls/flow1997.xls.
import excel ${alm_data_raw}/bea/flow1997.xls, sheet("Ind123") firstrow allstr clear 
drop if _n > 123
drop E ColumnNumber
ren (IndustryCode IndustryTitle Related1997NAICSCodes) (ind_flow_IO ind_title rel_naics_codes) 
split rel_naics_codes, parse(",") generate(c)
drop rel_naics_codes
reshape long c, i(ind_flow_IO) j(naics6_nr)
rename c naics6
drop if naics6 == "" 
replace ind_flow_IO = strtrim(ind_flow_IO)
replace ind_title = stritrim(ind_title)
* some industries have e.g., 23* as code, correct 
replace naics6 = strtrim(subinstr(naics6, "*","",.))
* and the lines begin with an apostrophe
replace naics6 = subinstr(naics6, char(39),"", .) 
gen delimpos = strpos(naics6,"-")  

* some obs indicate codes with a range and a - between
gen from = substr(naics6, delimpos-1, 1) if delimpos != 0
destring from, replace
gen to = substr(naics6, delimpos + 1, delimpos + 1) if delimpos != 0
destring to, replace
gen range = to - from if delimpos != 0
expand (range+1)  if delimpos != 0
bys naics6: gen nr = _n+from-1 if delimpos != 0
sort naics6
tostring nr, replace

replace naics6 = substr(naics6, 1, delimpos-2) + nr if delimpos != 0
replace naics6 = substr(naics6, 1, 6) + substr("000000", 1, 6 - length(naics6))
duplicates tag ind_flow_IO, gen(ind_dup)
gen ind_count = 1 + ind_dup
keep ind_flow_IO ind_title ind_count naics6 
tab ind_count

label var naics6 "NAICS6 code"
label var ind_count "Number of related NAICS6 codes of flow IO code"

tempfile io_flow_naics6
save "`io_flow_naics6'" 

*For cases where not an 1:1 mapping --> apply employment weights from CBP again
joinby naics6 using "`cbp98us'", unmatched(both)
sort naics6
drop if _merge==2 
*naics6 482000 no employment data but mapping anyway 1:1
replace emp=1 if naics6== "482000"
replace _merge=3 if naics6== "482000"
*the first agricultural industries also have no employment --> deal with them below
* generate the weight into naics6
egen emp_by_ind = total(emp), by(ind_flow_IO) 
gen weight_emp = emp/emp_by_ind 
drop emp_by_ind 
rename _merge _merge_CBP

label var _merge_CBP "CBP merged into NAICS6-IO"
label var weight_emp "Employment based weight for IO code"

tempfile io_naics6_flow_w_emp
save "`io_naics6_flow_w_emp'", replace 
*Preliminary: Match concorance IO NAICS 1997 with concordance NAICS 1997 to SIC 
joinby naics6 using "`naics6_sic4'", unmatched(both) 
sort naics6

* keep the cases where we again have to assing CBP employment from 1998. *
keep if _merge! = 3 
destring naics6, replace
* drop agriculture
drop if inrange(naics6, 111100, 112900) 
tostring naics6, replace
keep naics6 _merge
duplicates drop naics6 _merge, force
sort naics6 
* assign naics6 to aggragted codes 
gen naics6_top = naics6 if _merge==1
gen agg_naics6 = ""
gen naics_length = length(reverse(substr(reverse(naics6), indexnot(reverse(naics6), "0"),.)))
forvalues d = 5(-1)2 { 
    gen naics6_`d' = substr(naics6,1,`d') + substr("000000", 1, 6 - `d')
    gen tag_`d' = 1 if naics6_top == naics6_`d' & naics_length == `d'
    egen tot_`d' = max(tag_`d'), by(naics6_`d')
    replace agg_naics6 = naics6_`d' if tot_`d' == 1
    drop naics6_`d' tot_`d' tag_`d'
}
drop if naics6 == "990000" 
drop if _merge == 1
drop naics6_top _m naics_length

* use CBP employment weights again to map aggregataed NAICS6's into more disaggregated codes
joinby naics6 using "`cbp98us_pre'", unmatched(both)
sort naics6
assert _merge!=1
drop if _merge==2
drop _merge

egen emp_by_agg_naics6 = total(emp), by(agg_naics6)
gen weight_emp_disagg =emp/emp_by_agg_naics6
drop emp_by_agg_naics6 emp

label var agg_naics6 "Aggregated NAICS6 code"
label var weight_emp_disagg "Employment based weight"

tempfile naics6_sic4_disaggr
save "`naics6_sic4_disaggr'", replace

* apply disaggreagted mapping
use "`io_naics6_flow_w_emp'", clear
rename naics6 agg_naics6
joinby agg_naics6 using "`naics6_sic4_disaggr'", unmatched(both)
assert _merge!=2 
rename _merge _merge_disaggr

replace naics6 = agg_naics6 if _merge_disagg == 1 & _merge_CBP ==3
replace weight_emp_disagg =1 if _merge_disagg == 1 

* merge with mapping into SIC from raw Dorn file
joinby naics6 using "`naics6_sic4'", unmatched(both)
drop if _merge == 2
ren _merge _merge_sic4
sort ind_flow_IO agg_naics6 naics6 sic4

*one NAICS industry (482000) no match in this concordance table --> use concordance
*from "1997_NAICS_to_1987_SIC.xls" from Census Webpage
*enter by hand and divid equally (agriculture done further below differently!)
expand 2 if agg_naics6 == "482000"
bys agg_naics6:  gen dup = cond(_N==1,0,_n) if agg_naics6== "482000"
replace sic4 = 4011 if dup == 1
replace sic4 = 4013 if dup == 2
drop dup
replace weight_sic4 = 0.5 if agg_naics6 == "482000"
replace _merge_sic4 = 3 if agg_naics6 == "482000"

assert _merge_CBP == _merge_sic4 if _merge_sic4 ==3
sort ind_flow_IO agg_naics6 naics6 sic4

*do mapping ind_make_IO into sic4 using the weights (except the manual corrections for agriculture after)
sort ind_flow_IO agg_naics6 naics6 sic4
gen weight_combined = weight_emp * weight_emp_disagg * weight_sic4 if _merge_sic4 == 3
egen w_IO_flow_SIC = total(weight_combined) if _merge_sic4 ==3 ,by(ind_flow_IO sic4) 
duplicates drop ind_flow_IO sic4 if _merge_sic4, force

*For the first few industries (part of agriculture) we do not have employment
*weights at the CBP and also no weighted mapping into sic4 from Dorn
*So just assign them to the broadest SIC categories 0100 and 0200

*1110 goes mostly into 100
foreach x in 1110 { 
    duplicates drop ind_flow_IO if ind_flow_IO=="`x'", force
    replace sic4=100 if ind_flow_IO=="`x'"
    replace w_IO_flow_SIC=1 if ind_flow_IO=="`x'"
}

*1120 goes mostly into 200
foreach x in 1120 { 
    duplicates drop ind_flow_IO if ind_flow_IO=="`x'", force
    replace sic4=200 if ind_flow_IO=="`x'"
    replace w_IO_flow_SIC=1 if ind_flow_IO=="`x'"
}

* save concordance
keep ind_flow_IO sic4 w_IO_flow_SIC

label var w_IO_flow_SIC "relative employment of flow table IO going towards the sic4 code"

save ${alm_data_proc}/ind_flow_IO_sic4_concordance.dta, replace 

*********************************************
* D. Map Flow table into ind-sic4 concordance
*********************************************

* import and reshape flow table
*Bureau of Economic Analysis (BEA). 2003b. "Capital Flow Data for 1997."
* Accessed November 2018. https://www.bea.gov/sites/default/files/newsreleases/industry/capflow/2003/xls/flow1997.xls.
import excel ${alm_data_raw}/bea/flow1997.xls, sheet("180x123Combined") cellrange(B4:DV184) clear
drop C
foreach var of varlist *{
    label variable `var' "`=`var'[1]'"
}

local i=-1
foreach var of varlist *{
    local i = `i' + 1
    rename `var' industry`i'
}
rename industry0 com
drop if com=="Commodity"
reshape long industry, i(com) j(index_ind_flow)
rename industry value_flow
destring value_flow, replace
replace com = strtrim(com)

label var index_ind_flow "Index of flow industry"
label var value_flow "Value of flow table"

tempfile flow_tab_clean
save "`flow_tab_clean'", replace

* add flow industry codes (i.e. the columns)
*Bureau of Economic Analysis (BEA). 2003b. "Capital Flow Data for 1997." 
*Accessed November 2018. https://www.bea.gov/sites/default/files/newsreleases/industry/capflow/2003/xls/flow1997.xls.
import excel ${alm_data_raw}/bea/flow1997.xls, sheet("180x123Combined") cellrange(D3:DV4) clear
foreach v of varlist * {
   local vname = strtoname(`v'[1])
   rename `v' `vname'
}
drop in 1
gen industry = "industry"
reshape long _, i(industry) j(index_ind_flow)
ren _ ind_flow
drop industry
tempfile ind_codes
save "`ind_codes'", replace

*merge header into prepared format
use "`flow_tab_clean'", clear
merge m:1 index_ind_flow using "`ind_codes'"
assert _merge==3
drop _merge index_ind_flow
sort com ind_flow

label var ind_flow "Flow industry code"

tempfile flow_tab_clean_wc
save "`flow_tab_clean_wc'", replace

* Import and add the published corrections to the flow table
* Bureau of Economic Analysis (BEA). 2003a. "Corrections to the 1997 Capital Flow Estimates."
* Accessed November 2018. https://www.bea.gov/industry/corrections-1997-capital-flow-estimates.
import excel ${alm_data_raw}/bea/changes_to_180x123combined.xls, sheet("Changes to 180x123Combined") cellrange(B3:H104) clear
drop C
foreach var of varlist *{
    label variable `var' "`=`var'[1]'"
}
local i=-1
foreach var of varlist *{
    local i = `i' + 1
    rename `var' industry`i'
}
rename industry0 com
drop if com=="Commodity"
reshape long industry, i(com) j(index_ind_flow)
rename industry value_flow
destring value_flow, replace
replace com = strtrim(com) 
replace value_flow=0 if value_flow==.

label var index_ind_flow "Index of flow industry"
label var value_flow "Value of flow table"

tempfile flow_tab_corr_clean
save "`flow_tab_corr_clean'", replace

* Bureau of Economic Analysis (BEA). 2003a. "Corrections to the 1997 Capital Flow Estimates."
* Accessed November 2018. https://www.bea.gov/industry/corrections-1997-capital-flow-estimates.
import excel ${alm_data_raw}/bea/changes_to_180x123combined.xls, sheet("Changes to 180x123Combined") cellrange(D3:H3) clear
ren (D E F G H) (_1 _2 _3 _4 _5)
gen industry = "industry"
reshape long _, i(industry) j(index_ind_flow, string)
ren _ ind_flow
destring index_ind_flow, replace
drop industry

label var ind_flow "Flow industry code"
label var index_ind_flow "Index of flow industry"
tempfile ind_codes_corr
save "`ind_codes_corr'", replace
use "`flow_tab_corr_clean'", clear

*merge together to get a full file (i.e. merge the header into the format like for the avtual main table)
merge m:1 index_ind_flow using "`ind_codes_corr'"
assert _merge==3 
drop _merge index_ind_flow
sort com ind_flow
rename value_flow value_flow_corr

tempfile flow_tab_corr_clean_wc
save "`flow_tab_corr_clean_wc'", replace

*merge corrections into the original flow table
use "`flow_tab_clean_wc'", clear

merge 1:1 com ind_flow using "`flow_tab_corr_clean_wc'"
replace value_flow = value_flow_corr if _merge==3
drop _merge value_flow_corr
rename ind_flow ind_flow_IO

* Map into SiC using the concordance from the previous step
joinby ind_flow_IO using ${alm_data_proc}/ind_flow_IO_sic4_concordance.dta, unmatched(both)
assert _merge==3
drop _m
sort com ind_flow_IO sic4

gen w_value_flow = value_flow * w_IO_flow_SIC
egen value_flow_SIC=total(w_value_flow), by(com sic4)
duplicates drop com sic4, force
keep com sic4 value_flow_SIC
rename value_flow_SIC flow_value_SIC
rename sic4 sic_ind_flow

*drop construction commodities --> cannot be matched with make table
drop if inlist(com,"233511","233512","233513","233514","233515","233516","233517")
drop if inlist(com,"233523","233524","233525","233526","233527","233528","233529")
drop if inlist(com,"233532","233533","233534","233535","233536","233543","233544")
drop if inlist(com,"233545","233551","233552","233572","233573","233574","233612")
drop if inlist(com,"233621","233624","233625","233631","233701","233703","233706")
drop if inlist(com,"233722")

*drop noncomparable imports
drop if inlist(com,"S00300")

*generate the flow weight
egen flow_value_SIC_by_com = total(flow_value_SIC), by(com)
gen w_flow = flow_value_SIC/flow_value_SIC_by_com
egen tot_flow_by_com =total(w_flow), by(com)

* commodity 321920 is not invested in any industry --> drops
drop if tot_flow_by_com == 0 
keep com sic_ind_flow w_flow
label var com "Commodity code"
label var sic_ind_flow "SIC4 code"
label var w_flow "relative value of flow"

tempfile ind_flow_IO_sic4
save "`ind_flow_IO_sic4'", replace

*********************************************
* E. Match the Make and Flow table
*********************************************

*Import the MAKE table 
use "`sic_ind_make_value'", clear

*Match with FLOW table
joinby com using "`ind_flow_IO_sic4'", unmatched(both)
* 4.93 mio obs; 476 distinct coms
assert _merge!=2  

*Update weight

*Handle the commodities witch map into a dead end - keep here and delete later, then reshuffle
replace w_flow= 1 if _merge==1 
gen w_multiplied = w_make * w_flow
egen weight_make_flow = total(w_multiplied), by(sic_ind_make sic_ind_flow)
duplicates drop sic_ind_make sic_ind_flow, force
egen test=total(weight),by(sic_ind_make)
assert test>0.99998 & test<10001
drop _m

*fillin data set and assign a weight of zero to the filled in observations
*important to take average of some weights in next section
fillin sic_ind_make sic_ind_flow
replace weight_make_flow=0 if _fillin==1
drop _fillin
sort sic_ind_make sic_ind_flow

*correct a fault
replace sic_ind_make=8050 if sic_ind_make==8053
replace sic_ind_flow=8050 if sic_ind_flow==8053
tostring sic_ind_make, gen(make)
replace make="0" + make if sic_ind_make<1000
tostring sic_ind_flow, gen(flow)
replace flow="0" + flow if sic_ind_flow<1000

drop sic_ind_make sic_ind_flow
rename make sic_ind_make
rename flow sic_ind_flow

*handling missing values per sic make industry
egen min_tvm =min(total_value_make), by(sic_ind_make)
replace total_value_make=min_tvm if total_value_make==.
drop min_tvm

keep sic_ind_make sic_ind_flow total_value_make weight_make_flow 
order sic_ind_make sic_ind_flow total_value_make weight_make_flow 
sort sic_ind_make

label var weight_make_flow "employment weighted make value for commodity flow in sic4 code"

save ${alm_data_proc}/sic4_make_flow.dta, replace


}
if _rc == 0 {
    display "Execution finished successfully."
}
else {
    display "Execution finished with errors."
}

cap log close dat